import gzip
import itertools
import os
import sys

from Bio import SeqIO
from Bio.Seq import reverse_complement

# Pass 1: for each read number, gather the set of distinct read sequences
# found in all matching FASTQ files and write them, sorted and numbered,
# to seqlist_READ1.fa / seqlist_READ2.fa in the current directory.
directory = "/osc-fs_home/mdehoon/Data/CASPARs/MiSeq/Fastq/"
for readno in ("READ1", "READ2"):
    counter = 0  # number of FASTQ files read for this read number
    sequences = set()
    filenames = os.listdir(directory)
    for filename in sorted(filenames):
        # Every directory entry is expected to be named "<something>.fq.gz".
        basename, extension = os.path.splitext(filename)
        assert extension == ".gz"
        basename, extension = os.path.splitext(basename)
        assert extension == ".fq"
        if not basename.endswith(readno):
            continue
        path = os.path.join(directory, filename)
        print("Reading", path)
        # The context manager closes the gzip handle even if parsing fails.
        with gzip.open(path, "rt") as handle:
            for record in SeqIO.parse(handle, "fastq"):
                sequences.add(str(record.seq))
        counter += 1
    print("%s: %d files read; %d unique sequences" % (readno, counter, len(sequences)))
    filename = "seqlist_%s.fa" % readno
    print("Writing %s" % filename)
    with open(filename, 'w') as output:
        # Sorting and numbering are done on the sequences as read; READ2
        # entries are reverse-complemented only at the moment of writing.
        for i, sequence in enumerate(sorted(sequences)):
            if readno == "READ2":
                sequence = reverse_complement(sequence)
            output.write(">%s_%d\n" % (readno, i))
            output.write("%s\n" % sequence)

# Load seqlist_READ1.fa back into a dictionary mapping each sequence to its
# zero-based index in the file.
sequences1 = {}
filename = "seqlist_READ1.fa"
print("Reading %s" % filename)
# The context manager closes the file even if a sanity check below fails.
with open(filename) as handle:
    for i, record in enumerate(SeqIO.parse(handle, "fasta")):
        # Record ids have the form "READ1_<index>"; the index must match the
        # record's position in the file.
        readno, number = record.id.split("_")
        assert readno == "READ1"
        assert str(i) == number
        sequences1[str(record.seq)] = i

# Load seqlist_READ2.fa back into a dictionary mapping each sequence to its
# zero-based index in the file.
sequences2 = {}
filename = "seqlist_READ2.fa"
print("Reading %s" % filename)
# The context manager closes the file even if a sanity check below fails.
with open(filename) as handle:
    for i, record in enumerate(SeqIO.parse(handle, "fasta")):
        # Keys are the reverse complement of the stored sequence.
        sequence = reverse_complement(str(record.seq))
        # Record ids have the form "READ2_<index>"; the index must match the
        # record's position in the file.
        readno, number = record.id.split("_")
        assert readno == "READ2"
        assert str(i) == number
        sequences2[sequence] = i


# Build the list of library names: each directory entry named
# "<library>_READ1.fq.gz" contributes "<library>".
suffix = "_READ1"
directory = "/osc-fs_home/mdehoon/Data/CASPARs/MiSeq/Fastq/"
filenames = os.listdir(directory)
libraries = []
for entry in filenames:
    # Peel off the two extensions in turn; every entry must be "*.fq.gz".
    stem, outer_extension = os.path.splitext(entry)
    assert outer_extension == ".gz"
    stem, inner_extension = os.path.splitext(stem)
    assert inner_extension == ".fq"
    if stem.endswith(suffix):
        # Drop the trailing "_READ1" to obtain the library name.
        libraries.append(stem[:-len(suffix)])

# Pass 2: for every library, walk its READ1 and READ2 FASTQ files in
# lockstep and write "<library>.index.txt", one line per read pair:
#     <read name> TAB READ1_<index>,READ2_<index>
# where the indices come from sequences1 / sequences2.
counter = 0  # number of libraries written
libraries.sort()
for library in libraries:
    path1 = os.path.join(directory, "%s_READ1.fq.gz" % library)
    print("Reading", path1)
    path2 = os.path.join(directory, "%s_READ2.fq.gz" % library)
    print("Reading", path2)
    filename = "%s.index.txt" % library
    print("Writing", filename)
    # One with-statement so all three files are closed even on error.
    with gzip.open(path1, "rt") as handle1, \
            gzip.open(path2, "rt") as handle2, \
            open(filename, 'w') as output:
        records1 = SeqIO.parse(handle1, "fastq")
        records2 = SeqIO.parse(handle2, "fastq")
        # zip_longest pads the shorter stream with None, so a difference in
        # record counts is detected instead of being silently truncated.
        for record1, record2 in itertools.zip_longest(records1, records2):
            if record1 is None or record2 is None:
                raise ValueError(
                    "%s: READ1 and READ2 files contain different numbers of records"
                    % library
                )
            name1 = record1.id
            name2 = record2.id
            # Explicit raise rather than assert: the pairing check must not
            # disappear when Python runs with -O.
            if name1 != name2:
                raise ValueError(
                    "%s: read names differ between READ1 and READ2: %s != %s"
                    % (library, name1, name2)
                )
            value1 = sequences1[str(record1.seq)]
            value2 = sequences2[str(record2.seq)]
            output.write("%s\tREAD1_%d,READ2_%d\n" % (name1, value1, value2))
    counter += 1

print("Wrote %d libraries" % counter)
